{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#第四周作业：对活动进行聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1918771225</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1502284248</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3044012</td>\n",
       "      <td>2529072432</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3044012</td>\n",
       "      <td>3072478280</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1390707377</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp  interested  \\\n",
       "0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1   \n",
       "3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "\n",
       "   not_interested  \n",
       "0               0  \n",
       "1               0  \n",
       "2               0  \n",
       "3               0  \n",
       "4               0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取数据\n",
    "train = pd.read_csv(\"train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 15398 entries, 0 to 15397\n",
      "Data columns (total 6 columns):\n",
      "user              15398 non-null int64\n",
      "event             15398 non-null int64\n",
      "invited           15398 non-null int64\n",
      "timestamp         15398 non-null object\n",
      "interested        15398 non-null int64\n",
      "not_interested    15398 non-null int64\n",
      "dtypes: int64(5), object(1)\n",
      "memory usage: 721.9+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1776192</td>\n",
       "      <td>2877501688</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1776192</td>\n",
       "      <td>3025444328</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1776192</td>\n",
       "      <td>4078218285</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1776192</td>\n",
       "      <td>1024025121</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1776192</td>\n",
       "      <td>2972428928</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:21.985000+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp\n",
       "0  1776192  2877501688        0  2012-11-30 11:39:01.230000+00:00\n",
       "1  1776192  3025444328        0  2012-11-30 11:39:01.230000+00:00\n",
       "2  1776192  4078218285        0  2012-11-30 11:39:01.230000+00:00\n",
       "3  1776192  1024025121        0  2012-11-30 11:39:01.230000+00:00\n",
       "4  1776192  2972428928        0  2012-11-30 11:39:21.985000+00:00"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取数据\n",
    "test = pd.read_csv(\"test.csv\")\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10237 entries, 0 to 10236\n",
      "Data columns (total 4 columns):\n",
      "user         10237 non-null int64\n",
      "event        10237 non-null int64\n",
      "invited      10237 non-null int64\n",
      "timestamp    10237 non-null object\n",
      "dtypes: int64(3), object(1)\n",
      "memory usage: 320.0+ KB\n"
     ]
    }
   ],
   "source": [
    "test.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取活动数据\n",
    "dpath = 'E:/data/'\n",
    "events = pd.read_csv(dpath +\"events.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3137972 entries, 0 to 3137971\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: float64(2), int64(103), object(5)\n",
      "memory usage: 2.6+ GB\n"
     ]
    }
   ],
   "source": [
    "events.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 用户和活动关联关系处理\n",
    "\n",
    "\n",
    "#整个数据集中活动数目（events.csv）太多，所以下面的处理我们找出只在训练集和测试集中出现的活动和用户集合，并对他们重新编制索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存数据\n",
    "import pickle\n",
    "\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "# 统计训练集中有多少不同的用户的events\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "#倒排表\n",
    "#统计每个用户参加的活动   / 每个活动参加的用户\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "    \n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    f = open(filename, 'rb')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "    f.readline().strip().split(\",\")\n",
    "    \n",
    "    for line in f:    #对每条记录\n",
    "        cols = line.strip().split(\",\")\n",
    "        uniqueUsers.add(cols[0])   #第一列为用户ID\n",
    "        uniqueEvents.add(cols[1])   #第二列为活动ID\n",
    "        \n",
    "        #eventsForUser[cols[0]].add(cols[1])    #该用户参加了这个活动\n",
    "        #usersForEvent[cols[1]].add(cols[0])    #该活动被用户参加\n",
    "    f.close()\n",
    "\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n",
    "\n",
    "#用户关系矩阵表，可用于后续LFM/SVD++处理的输入\n",
    "#这是一个稀疏矩阵，记录用户对活动感兴趣\n",
    "userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))\n",
    "userIndex = dict()\n",
    "eventIndex = dict()\n",
    "\n",
    "#重新编码用户索引字典\n",
    "for i, u in enumerate(uniqueUsers):\n",
    "    userIndex[u] = i\n",
    "    \n",
    "#重新编码活动索引字典    \n",
    "for i, e in enumerate(uniqueEvents):\n",
    "    eventIndex[e] = i\n",
    "\n",
    "n_records = 0\n",
    "ftrain = open(\"train.csv\", 'rb')\n",
    "ftrain.readline()\n",
    "for line in ftrain:\n",
    "    cols = line.strip().split(\",\")\n",
    "    i = userIndex[cols[0]]  #用户\n",
    "    j = eventIndex[cols[1]] #活动\n",
    "    \n",
    "    eventsForUser[i].add(j)    #该用户参加了这个活动\n",
    "    usersForEvent[j].add(i)    #该活动被用户参加\n",
    "        \n",
    "    #userEventScores[i, j] = int(cols[4]) - int(cols[5])   #interested - not_interested\n",
    "    score = int(cols[4])\n",
    "    #if score == 0:  #0在稀疏矩阵中表示该元素不存在，因此借用-1表示interested=0\n",
    "    #userEventScores[i, j] = -1\n",
    "    #else:\n",
    "    userEventScores[i, j] = score\n",
    "ftrain.close()\n",
    "\n",
    "  \n",
    "##统计每个用户参加的活动，后续用于将用户朋友参加的活动影响到用户\n",
    "pickle.dump(eventsForUser, open(\"PE_eventsForUser.pkl\", 'wb'))\n",
    "##统计活动参加的用户\n",
    "pickle.dump(usersForEvent, open(\"PE_usersForEvent.pkl\", 'wb'))\n",
    "\n",
    "#保存用户-活动关系矩阵R，以备后用\n",
    "sio.mmwrite(\"PE_userEventScores\", userEventScores)\n",
    "\n",
    "\n",
    "#保存用户索引表\n",
    "pickle.dump(userIndex, open(\"PE_userIndex.pkl\", 'wb'))\n",
    "#保存活动索引表\n",
    "pickle.dump(eventIndex, open(\"PE_eventIndex.pkl\", 'wb'))\n",
    "\n",
    "    \n",
    "# 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event\n",
    "# 所谓的关联用户，指的是至少在同一个event上有行为的用户pair\n",
    "# 关联的event指的是至少同一个user有行为的event pair\n",
    "uniqueUserPairs = set()\n",
    "uniqueEventPairs = set()\n",
    "for event in uniqueEvents:\n",
    "    i = eventIndex[event]\n",
    "    users = usersForEvent[i]\n",
    "    if len(users) > 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(users, 2))\n",
    "        \n",
    "for user in uniqueUsers:\n",
    "    u = userIndex[user]\n",
    "    events = eventsForUser[u]\n",
    "    if len(events) > 2:\n",
    "        uniqueEventPairs.update(itertools.combinations(events, 2))\n",
    " \n",
    "#保存用户-事件关系对索引表\n",
    "pickle.dump(uniqueUserPairs, open(\"FE_uniqueUserPairs.pkl\", 'wb'))\n",
    "pickle.dump(uniqueEventPairs, open(\"PE_uniqueEventPairs.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取列表索引\n",
    "eventIndex = pickle.load(open(\"PE_eventIndex.pkl\", 'rb'))\n",
    "n_events=len(eventIndex)\n",
    "eventContMatrix = ss.dok_matrix((n_events, 101))\n",
    "EVENT=open(\"E:/data/events.csv\",'rb')\n",
    "for line in EVENT.readlines():\n",
    "    cols = line.strip().split(\",\")\n",
    "    eventId = str(cols[0])\n",
    "    \n",
    "    if eventIndex.has_key(eventId):  #在训练集或测试集中出现\n",
    "        i = eventIndex[eventId]        \n",
    "        #只读取词频\n",
    "        for j in range(9, 110):\n",
    "            eventContMatrix[i, j-9] = cols[j]\n",
    "EVENT.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "#建立K-mean的function\n",
    "def K_cluster_analysis(K, df):\n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    km = MiniBatchKMeans(n_clusters = K)\n",
    "    km.fit(df)\n",
    "    \n",
    "    #保存预测结果\n",
    "    cluster_result = km.predict(df)\n",
    "\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(df,cluster_result)   \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "\n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  (3774, 91)\t1.0\n",
      "  (5315, 89)\t1.0\n",
      "  (13405, 54)\t2.0\n",
      "  (2139, 42)\t1.0\n",
      "  (2174, 18)\t1.0\n",
      "  (2535, 96)\t1.0\n",
      "  (4136, 0)\t2.0\n",
      "  (1685, 96)\t1.0\n",
      "  (5597, 78)\t2.0\n",
      "  (11103, 14)\t1.0\n",
      "  (13066, 38)\t1.0\n",
      "  (10409, 22)\t3.0\n",
      "  (10718, 40)\t1.0\n",
      "  (8782, 45)\t1.0\n",
      "  (5519, 97)\t1.0\n",
      "  (4110, 1)\t4.0\n",
      "  (5452, 4)\t1.0\n",
      "  (541, 34)\t1.0\n",
      "  (834, 44)\t2.0\n",
      "  (2117, 64)\t1.0\n",
      "  (8449, 77)\t1.0\n",
      "  (449, 74)\t1.0\n",
      "  (12245, 68)\t1.0\n",
      "  (10583, 78)\t1.0\n",
      "  (5861, 17)\t3.0\n",
      "  :\t:\n",
      "  (6611, 9)\t2.0\n",
      "  (1847, 2)\t27.0\n",
      "  (3419, 6)\t1.0\n",
      "  (9010, 22)\t3.0\n",
      "  (6601, 24)\t1.0\n",
      "  (5514, 51)\t3.0\n",
      "  (9333, 85)\t2.0\n",
      "  (7066, 85)\t3.0\n",
      "  (5349, 14)\t1.0\n",
      "  (2739, 33)\t3.0\n",
      "  (6070, 13)\t3.0\n",
      "  (9485, 100)\t5.0\n",
      "  (4671, 22)\t3.0\n",
      "  (10316, 63)\t2.0\n",
      "  (7053, 71)\t1.0\n",
      "  (8648, 29)\t2.0\n",
      "  (9775, 7)\t1.0\n",
      "  (12959, 2)\t3.0\n",
      "  (8288, 100)\t5.0\n",
      "  (3124, 5)\t7.0\n",
      "  (7357, 25)\t1.0\n",
      "  (12892, 39)\t1.0\n",
      "  (741, 23)\t2.0\n",
      "  (1827, 100)\t14.0\n",
      "  (4589, 43)\t1.0\n"
     ]
    }
   ],
   "source": [
    "print(eventContMatrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.402643615561\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.29911157589\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.229808795762\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.147487121168\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.127976747827\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.112972741417\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.0672636829247\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.0976170836981\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.065586075579\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.0785748967649\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "CH_scores = []\n",
    "Ks = [10,20,30,40,50,60,70,80,90,100]\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, eventContMatrix)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x9bbb46d8>]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xl0VOX9x/H3lyAoioqSWgVpgOKC2oqMiFipO6gVFFHBDZeKqLgrghsVrQriXqtSa1u1iohd0IPFFbe6kCgii0hAhYgKbugPUAS+vz+emzJgIJNkkjsz9/M6Z05yt5lv5tx85s69z/Ncc3dERCQZGsVdgIiINByFvohIgij0RUQSRKEvIpIgCn0RkQRR6IuIJIhCX0QkQRT6IiIJotAXEUmQxnEXsK6WLVt6SUlJ3GWIiOSVsrKyz929uLr1ci70S0pKKC0tjbsMEZG8YmYfZbKeTu+IiCSIQl9EJEEU+iIiCaLQFxFJEIW+iEiCKPRFRBIko9A3s55mNtvMys1s6AbW62tmbmaptHnDou1mm1mPbBQtIiK1U23om1kRcBdwKNAR6G9mHatYrzlwHvBG2ryOQD9gF6An8Mfo+bLum2/g8sth7tz6eHYRkcKQyZF+F6Dc3ee5+wpgLNC7ivWuBUYB36XN6w2Mdffv3f0DoDx6vqxbuhTuuAOGrvd7iIiIZBL6rYAFadMV0bz/MbNOwPbu/mRNt82WbbeFIUNg/Hh49dX6eAURkfyXSehbFfP8fwvNGgG3AhfXdNu05xhoZqVmVrp48eIMSqraxRfDdtuFn/6jVxERkUxCvwLYPm26NbAwbbo5sCsw2cw+BLoCE6KLudVtC4C7j3H3lLuniourHS9ovTbdFK67Dt54A8aNq/XTiIgUrExCfwrQwczamlkTwoXZCZUL3X2Ju7d09xJ3LwFeB3q5e2m0Xj8za2pmbYEOwJtZ/yvSnHwy/PKX4dz+99/X5yuJiOSfakPf3VcCg4FJwCxgnLvPMLMRZtarmm1nAOOAmcB/gHPcfVXdy16/oiIYPRo+/BDuvLM+X0lEJP+Y59jJ71Qq5dkYWvnww8MF3fJyaNkyC4WJiOQwMytz91R16xVsj9xRo+Dbb+Haa+OuREQkdxRs6O+yC5xxBvzxj/D++3FXIyKSGwo29AGuuQY23lgdtkREKhV06G+zTQj8f/4TXnop7mpEROJX0KEPcOGF0Lp16LC1enXc1YiIxKvgQ79ZM/j976G0FMaOjbsaEZF4FXzoA5x4InTqBMOGwfLlcVcjIhKfRIR+o0Zw880wf34YiVNEJKkSEfoA++8PRxwB118PdRjTTUQkryUm9CF02Fq6FH73u7grERGJR6JCf6ed4Mwz4d574b334q5GRKThJSr0IRzlN2sWbrgiIpI0iQv94uJwL90nnoAXXoi7GhGRhpW40Ac4/3xo0wYuuUQdtkQkWRIZ+ptsElrxvPUW/P3vcVcjItJwEhn6AP37QyoVTvUsWxZ3NSIiDSOxoV/ZYauiAm69Ne5qREQaRmJDH6B7dzjySLjxRvjss7irERGpf4kOfYCRI+G772D48LgrERGpf4kP/R12gLPOgj/9CWbOjLsaEZH6lVHom1lPM5ttZuVm9qP7UJnZIDN718ymmtkrZtYxml9iZsuj+VPN7J5s/wHZcPXV0Lw5XHpp3JWIiNSvakPfzIqAu4BDgY5A/8pQT/Owu+/m7rsDo4Bb0pbNdffdo8egbBWeTS1bwhVXwMSJ8OyzcVcjIlJ/MjnS7wKUu/s8d18BjAV6p6/g7t+kTW4KePZKbBjnngslJeEOW6tWxV2NiEj9yCT0WwEL0qYronlrMbNzzGwu4Uj/vLRFbc3sbTN70cz2rVO19WjjjUMrnmnT4IEH4q5GRKR+ZBL6VsW8Hx3Ju/td7t4euAy4Mpr9CdDG3TsBFwEPm9nmP3oBs4FmVmpmpYtjHOz+2GNhr73gyivDEMwiIoUmk9CvALZPm24NLNzA+mOBIwHc/Xt3/yL6vQyYC+yw7gbuPsbdU+6eKi4uzrT2rDMLHbYWLgw/RUQKTSahPwXoYGZtzawJ0A+YkL6CmXVImzwcmBPNL44uBGNm7YAOwLxsFF5f9tkHjj463HDlk0/irkZEJLuqDX13XwkMBiYBs4Bx7j7DzEaYWa9otcFmNsPMphJO4wyI5ncHppnZO8B4YJC7f5n1vyLLRo6EFStCU04RkUJi7rnV0CaVSnlpaWncZXDRRXD77TB1Kuy2W9zViIhsmJmVuXuquvUS3yN3fa68ErbYQh22RKSwKPTXY6ut4KqrYNKk8BARKQQK/Q04+2xo1y7cYUsdtkSkECj0N6Bp03BRd/p0+Mtf4q5GRKTuFPrVOPpo6NYtnOr5v/+LuxoRkbpR6FejssPWp5/CTTfFXY2ISN0o9DPQtSscd1wI/Y8/jrsaEZHaU+hn6IYbwsXcK6+sfl0RkVyl0M9Q27Zw3nnwt7+FDlsiIvlIoV8Dl18OLVqEJpw51pFZRCQjCv0aaNEi3ED9uefgqafirkZEpOYU+jU0aBB06BCO9leujLsaEZGaUejXUJMmocPWrFlw331xVyMiUjMK/Vo48kjYd99wquebb6pfX0QkVyj0a6Gyw9aiReGoX0QkXyj0a2nPPeH44+GWW2DBgurXFxHJBQr9Orj++tB084or4q5ERCQzCv06+NnP4IIL4MEHoaws7mpERKqn0K+jYcOgZUt12BKR/KDQr6MttoDf/Q4mT4Ynnoi7GhGRDVPoZ8HAgbDjjjBkCPzwQ9zViIisX0ahb2Y9zWy2mZWb2dAqlg8ys3fNbKqZvWJmHdOWDYu2m21mPbJZfK7YaCMYNQpmz4YxY+KuRkRk/cyrORFtZkXA+8DBQAUwBejv7jPT1tnc3b+Jfu8FnO3uPaPwfwToAmwHPAvs4O7rveNsKpXy0tLSuv1VMXCHAw4It1YsLw+nfUREGoqZlbl7qrr1MjnS7wKUu/s8d18BjAV6p69QGfiRTYHKT5LewFh3/97dPwDKo+crOJUdtj7/PIy9LyKSizIJ/VZAevejimjeWszsHDObC4wCzqvhtgPNrNTMShcvXpxp7Tlnjz3gpJPgttvCqR4RkVyTSehbFfN+dE7I3e9y9/bAZUDl/aUy3XaMu6fcPVVcXJxBSbnrhhugeXPo1Qu++iruakRE1pZJ6FcA26dNtwYWbmD9scCRtdw277VqBf/4B3zwARx7rFrziEhuyST0pwAdzKytmTUB+gET0lcwsw5pk4cDc6LfJwD9zKypmbUFOgBv1r3s3LbvvnDPPfDss3DhhXFXIyKyRuPqVnD3lWY2GJgEFAH3u/sMMxsBlLr7BGCwmR0E/AB8BQyItp1hZuOAmcBK4JwNtdwpJKedBjNmhAHZdtkFzjor7opERDJostnQ8rXJZlVWrQrn9idNCo8DD4y7IhEpVNlssim1VFQEjzwSeusecwzMmVP9NiIi9UmhX8823zyMydOoERxxBHz9ddwViUiSKfQbQLt28PjjMHcuHHecbqguIvFR6DeQX/8a7r4bnn4aLroo7mpEJKmqbb0j2fPb34YWPbfdFlr0nHlm3BWJSNLoSL+B3XQT9OwJgwfDCy/EXY2IJI1Cv4E1bgxjx0KHDnD00WFEThGRhqLQj8EWW4QWPWZq0SMiDUuhH5P27UOLnvJy6NdPLXpEpGEo9GO0335w112ht+4ll8RdjYgkgVrvxGzgwNCi5/bbQ4ueM86IuyIRKWQ60s8BN98MPXrA2WfD5MlxVyMihUyhnwMqW/S0bx9a9MydG3dFIlKoFPo5YsstQ4se99CiZ8mSuCsSkUKk0M8hHTrA+PFhNM7+/cPQzCIi2aTQzzEHHAB33glPPQWXXhp3NSJSaNR6JwcNGhRa9Nx6a2jRc/rpcVckIoVCR/o56tZb4eCDw20WX3op7mpEpFAo9HNU48bw6KPQti306QPz5sVdkYgUAoV+DmvRIrToWbUqtOj55pu4KxKRfJdR6JtZTzObbWblZja0iuUXmdlMM5tmZs+Z2c/Slq0ys6nRY0I2i0+CHXaAxx6D2bPVokdE6q7a0DezIuAu4FCgI9DfzDqus9rbQMrdfwGMB0alLVvu7rtHj15ZqjtRDjoI7rgDJk6Eyy6LuxoRyWeZHOl3AcrdfZ67rwDGAr3TV3D3F9x9WTT5OtA6u2XK2WeHx803w1/+Enc1IpKvMgn9VsCCtOmKaN76nA48lTa9sZmVmtnrZnZkLWqUyG23wYEHhtssvvxy3NWISD7KJPStinle5YpmJwIp4Ka02W3cPQUcD9xmZu2r2G5g9MFQunjx4gxKSqaNNgrn90tKQoueDz6IuyIRyTeZhH4FsH3adGtg4bormdlBwBVAL3f/vnK+uy+Mfs4DJgOd1t3W3ce4e8rdU8XFxTX6A5KmskXPypXQqxd8+23cFYlIPskk9KcAHcysrZk1AfoBa7XCMbNOwL2EwF+UNr+FmTWNfm8J7APMzFbxSbXjjjBuHMyaBccfrxY9IpK5akPf3VcCg4FJwCxgnLvPMLMRZlbZGucmYDPgsXWaZu4MlJrZO8ALwI3urtDPgoMPDuf4n3wShg2LuxoRyRfmXuXp+dikUikvLS2Nu4y84B5a9NxzT2jRc8opcVckInExs7Lo+ukGqUduHjML7fcPOCC06Hn11bgrEpFcp9DPc5Utetq0gaOOgg8/jLsiEcllCv0CsNVWoUXPihVq0SMiG6bQLxA77RRa9MyYASeeCKtXx12RiOQihX4BOeSQMA7/hAlw+eVxVyMiuUh3ziow554bjvZHjoSOHeHkk+OuSERyiY70C4wZ/OEPsN9+cMYZMG1a3BWJSC5R6BegjTYK5/dbtAjn97/7Lu6KRCRXKPQLVHEx/PnP8O67cMUVcVcjIrlCoV/ADj8cBg2CW26B55+PuxoRyQUK/QI3enS45eKAAfDVV3FXIyJxU+gXuE03hYcegk8/hXPOibsaEYmbQj8B9twTrr4aHnkEHn447mpEJE4K/YQYNgz23juMyjl/ftzViEhcFPoJ0bgxPPhguOHKgAEapkEkqRT6CdK+fbjxyuTJoUWPiCSPQj9hTjsNjjwytN1Xb12R5FHoJ4wZjBkTeuuecIJ664okjUI/gYqL4f77Yfp0jcYpkjQK/YQ67DA466wwFPNzz8VdjYg0lIxC38x6mtlsMys3s6FVLL/IzGaa2TQze87Mfpa2bICZzYkeA7JZvNTN6NGw447qrSuSJNWGvpkVAXcBhwIdgf5m1nGd1d4GUu7+C2A8MCraditgOLAX0AUYbmYtsle+1EWzZqG37mefhaN+97grEpH6lsmRfheg3N3nufsKYCzQO30Fd3/B3ZdFk68DraPfewDPuPuX7v4V8AzQMzulSzakUjB8ODz6qHrriiRBJqHfCliQNl0RzVuf04GnarmtxGDoUOjWLYzN89FHcVcjIvUpk9C3KuZVeSLAzE4EUsBNNdnWzAaaWamZlS5evDiDkiSb1u2tu2pV3BWJSH3JJPQrgO3TplsDC9ddycwOAq4Aern79zXZ1t3HuHvK3VPFxcWZ1i5Z1K4d3H47vPiieuuKFLJMQn8K0MHM2ppZE6AfMCF9BTPrBNxLCPxFaYsmAYeYWYvoAu4h0TzJQaeeCkcdFXrrvvNO3NWISH2oNvTdfSUwmBDWs4Bx7j7DzEaYWa9otZuAzYDHzGyqmU2Itv0SuJbwwTEFGBHNkxxU2Vt3663VW1ekUJnnWDu9VCrlpaWlcZeRaE89FTpvXXBB6LwlIrnPzMrcPVXdeuqRKz9y6KGhJc9tt8Gzz8ZdjYhkk0JfqjRqFOy0E5xyCnypE3IiBUOhL1VSb12RwqTQl/Xq3BmuuQbGjYO//z3uakQkGxT6skGXXQb77KPeuiKFQqEvG1RUFHrrrl4NJ5+s3roi+U6hL9Vq2xbuvBNeegluvjnuakSkLhT6kpEBA6BPH7jySpg6Ne5qRKS2FPqSETO49941vXWXL4+7IhGpDYW+ZKxlS/jrX2HmTBg2LO5qRKQ2FPpSIz16wODBYUTOZ56JuxoRqSmFvtTYyJHqrSuSrxT6UmPNmoXOWosWwaBB6q0rkk8U+lIre+wBI0bAY4+F4RpEJD8o9KXWhgyBX/0q9Nb98MO4qxGRTCj0pdaKiuCBB8Lv6q0rkh8U+lInlb11X34ZRo+OuxoRqY5CX+rs5JPh6KPhqqvg7bfjrkZENkShL3VW2Vu3ZUv11hXJdQp9yYqttw69dWfNgqFD465GRNZHoS9Zc8ghcO65cMcd8PTTcVcjIlXJKPTNrKeZzTazcjP70XGcmXU3s7fMbKWZ9V1n2Sozmxo9JmSrcMlNI0fCzjuH3rpffBF3NSKyrmpD38yKgLuAQ4GOQH8z67jOavOBU4CHq3iK5e6+e/ToVcd6JcdtsknorPX55+qtK5KLMjnS7wKUu/s8d18BjAV6p6/g7h+6+zRgdT3UKHmmsrfu+PHhrlsikjsyCf1WwIK06YpoXqY2NrNSM3vdzI6sagUzGxitU7p48eIaPLXkqksvhX33DSNyfvBB3NWISKVMQt+qmFeTL+1t3D0FHA/cZmbtf/Rk7mPcPeXuqeLi4ho8teSq9N66PXrALbfAggUb3kZE6l8moV8BbJ823RpYmOkLuPvC6Oc8YDLQqQb1SR4rKYFx46B5c7j4YmjTJozVc8cdsDDjPUhEsimT0J8CdDCztmbWBOgHZNQKx8xamFnT6PeWwD7AzNoWK/mnZ08oK4P334frroNvv4Xzz4fWreHXv4Y//hE++yzuKkWSo9rQd/eVwGBgEjALGOfuM8xshJn1AjCzPc2sAjgGuNfMZkSb7wyUmtk7wAvAje6u0E+gDh3giivgnXfC7RaHD4fFi8MIndttBwceCGPGhFY/IlJ/zHOsTV0qlfLS0tK4y5AGMn06PPpoeMyZE64FHHggHHccHHkkbLVV3BWK5AczK4uun26QeuRKrHbdFa69FmbPDoO1DRkC5eVw+unw05/C4YeHC8JLlsRdqUhhUOhLTjCD3XeH668PoT9lClxwAcyYAQMGwE9+Ar17h9s0fvtt3NWK5C+FvuQcM0ilYNSo0Mb/tdfCuf+yMjjxRCguhj59wimhpUvjrlYkvyj0JaeZQdeuoZ3//PnwyiswcGD4IOjXL3wAHHssPP44LFsWd7UiuU+hL3mjUSPYZ5/Qzr+iAiZPDgO7TZ4MffuGU0DHHw///jd8913MxYrkKIW+5KWiojXt/BcuhGefDYE/aVJo9bPNNuGOXk8+CStWxF2tSO5Q6Evea9x4TTv/Tz+F//wn3L7xiSfgiCPCB8Bpp4VvAF9/HXe1IvFSO30pWCtWwDPPhAu+//pXaPXTqBF07hw+JA48ELp1g2bN4q5UpO4ybaev0JdE+P57eP11eO658HjzTVi5Epo0CcF/wAHhQ2DPPWGjjeKuVqTmFPoiG/Dtt/Dyy+ED4PnnYerUMH+zzaB79zXfBHbbLXw7EMl1mYZ+44YoRiTXNG8Ohx0WHhDG/Jk8ec03gYkTw/yWLWH//dd8E/j5z0MzUpF8pSN9kSosWBC+ATz/fPgQ+PjjMH/77dd8ABxwALSqye2EROqRTu+IZIl7GBq68gPghRfgyy/Dsh13XPMBsP/+GiBO4qPQF6knq1eHIaIrrwe89FIYDqJy/KDK6wG/+lW4RiDSEBT6Ig1kxYowQFzl9YDXXoMffgj9B7p2XfNNoGvX0FpIpD4o9EVismxZGCOo8nRQWVk4RdSsWTj6Hzo0nAoSySaNpy8Sk2bN4JBD4MYbwzeAL76Af/4z9Ap+7z046CAYPTp8EIg0NIW+SD1r0SKMB3TnneFOYX36wKWXQv/+GhpaGp5CX6QBNW8O48aFbwGPPRbO85eXx12VJIlCX6SBmcFll4WB4RYuDEM/VHYGE6lvGYW+mfU0s9lmVm5mQ6tY3t3M3jKzlWbWd51lA8xsTvQYkK3CRfLdwQdDaSmUlMBvfhPuFbx6ddxVSaGrNvTNrAi4CzgU6Aj0N7OO66w2HzgFeHidbbcChgN7AV2A4WbWou5lixSGtm3h1VfhhBPg6qvhqKN0E3ipX5kc6XcByt19nruvAMYCvdNXcPcP3X0asO5xSg/gGXf/0t2/Ap4BemahbpGC0awZPPBAuCPYxInQpQvMnBl3VVKoMgn9VsCCtOmKaF4m6rKtSGKYwbnnhnb9X38Ne+0V7vsrkm2ZhH5VYwpm2sI4o23NbKCZlZpZ6eLFizN8apHC0707vPUW7LJLuO/vsGGwalXcVdWPF1+Eq66CuXPjriRZMgn9CmD7tOnWwMIMnz+jbd19jLun3D1VXFyc4VOLFKZWrUIgDhwYmnYedljo4FUoysqgRw/Ybz+47jrYeWe48MI1g9hJ/cok9KcAHcysrZk1AfoBEzJ8/knAIWbWIrqAe0g0T0Q2oGlTuPde+NOfwjj/qdSaG73kq/feg2OOCX9LWVnolTxvHpxySrie0b493HJLuMuZ1CN3r/YBHAa8D8wFrojmjQB6Rb/vSTiqXwp8AcxI2/Y0oDx6nFrda3Xu3NlFZI3XX3dv1cp9k03cH3oo7mpq7qOP3E891b1RI/fNNnO/+mr3JUvWXufdd9179nQH97Zt3R991H316njqzVdAqWeS55ms1JAPhb7Ij336qXv37uE/9vzz3VesiLui6n32Wai1SRP3pk3dL7zQfdGiDW8zaZL7L34R/s6uXd1feaVhai0EmYa+euSK5IFttoFnn4ULLoDbbw8duz77LO6qqrZkSbhA265dGG/opJNgzpxw6qa6S3aHHBIuZN9/P3z0URiVtG9fXezNJoW+SJ7YaCO49VZ46CF4803o3Dn8zBXLl8NNN4Wwv+46OPzw0N/gvvvCbSYzVVQEp54aPiiuuSYMV6GLvdmj0BfJMyecAP/9b7ghy777hlCN0w8/wD33hJvGDxkSOpeVlcGjj4bbSdbWppuGXspz5uhibzYp9EXy0O67h3F79tsPzjgDzjyz4YNw9Wp4+OFwFH7WWWEMoRdfhKeegj32yN7rbLstjBkTblG5995w8cXhNceN0z0JakOhL5KnttoqDNswbFgIxf32g48/rv/XdYcnnggfPCecEO4D/OST4W5h3bvX3+vuumv4e59+OgxRfdxx0K1bGLtIMqfQF8ljRUVw/fVhyIbp08N5/pdfrr/Xe/HFcHG1V69wDv+RR8KF18MPD0NJNISDD15zsXf+/DUXe3Vfgswo9EUKQJ8+8MYbsMUW4Sbsd9yR3VMfZWXQs2f4NvHhh6Hj2MyZ0K8fNIohRSov9r7/PowYES72duwYLvbmW+9ld/jgg3Cq7KGHGuQF42+bn/5QO32R2vv6a/devUI795NOcl+6tG7PN2uWe9++4fm23tp99Gj3ZcuyU2s2ffKJ+xlnhA5gW24Z6vzuu7irqtry5e6vvup+003uffq4//Sn4f0F906dav+8ZNhO3zzHroSkUikvLS2NuwyRvLV6dTjlc/XV8Mtfhpuyl5TU7Dnmzw/NJf/61zD080UXhQuom29eHxVnz/TpoQXRU0+FexXccAMce2zDnXqqysKFobXVa6+Fn2+9BStWhGXt2oXrEnvvHX7uuis0bly71zGzMndPVbueQl+kME2cGC60NmoEY8eGc+HVWbQofGDcfXeYPvtsuPzy6jtV5ZpnnoFLLoFp08Iw1TffDPvsU/+v+8MP4TX/+981Qf/RR2FZ06Zh3KFu3dYE/TbbZO+1FfoiQnl5ON8/Y0YI8yFDqj7qXbIkDIB2663hAu2pp4ZvCm3aNHzN2bJqVbg5zZVXhqPto48Oo5b+/OfZe43PPw/BXnkU/+ab4f2DMFpqesB36hT6VtQXhb6IALB0KZx+eugs1bdvaPXSvHlYtnw5/OEPIQy//DKMgnnttXXrVJVrli4NHbpGjgynVc4+OwwTsfXWNXue1avDxev0o/j33w/LGjcOoV55mqZbt5r1Qs4Ghb6I/I97CL4hQ2CnneCxx0LTzhEjwlFwz57w+99nt1NVrvn0Uxg+PPRgbt48fAM499xw2qUqS5aEFlGVR/Gvvw7ffBOWtWy59lF8KhWufcRJoS8iP/L886FT0+efh+lu3cLFzvrsVJVr0i/2lpSEbznHHBMGdUs/ip8+PXxYmsFuu619FN++fbwXh6ui0BeRKs2fHwZE69073JUr18KroaRf7G3WDJYtC/M333xNwO+9d7gQnOutlkChLyJSrVWr4MEHw6mbzp1D0O+8czwdzuoq09CvZYtQEZH8V1QURvA85ZS4K2k4efh5JiIitaXQFxFJEIW+iEiCZBT6ZtbTzGabWbmZDa1ieVMzezRa/oaZlUTzS8xsuZlNjR73ZLd8ERGpiWov5JpZEXAXcDBQAUwxswnuPjNttdOBr9z952bWDxgJHBctm+vuu2e5bhERqYVMjvS7AOXuPs/dVwBjgd7rrNMb+Fv0+3jgQLOktv4VEcldmYR+K2BB2nRFNK/Kddx9JbAEqBzZoq2ZvW1mL5rZvnWsV0RE6iCTdvpVHbGv26Nrfet8ArRx9y/MrDPwLzPbxd2/WWtjs4HAQIA2+Tysn4hIjssk9CuA9PHiWgML17NOhZk1BrYAvozu5vI9gLuXmdlcYAdgrS637j4GGANgZovN7KNa/C25pCXwedxF5BC9H2vT+7GG3ou11eX9+FkmK2US+lOADmbWFvgY6Accv846E4ABwGtAX+B5d3czKyaE/yozawd0AOZt6MXcPc9u1/BjZlaaSXfopND7sTa9H2vovVhbQ7wf1Ya+u680s8HAJKAIuN/dZ5jZCMI9GScAfwYeNLNy4EvCBwNAd2CEma0EVgGD3P3L+vhDRESkehmNvePuE4GJ68y7Ou3374BjqtjuceDxOtYoIiJZoh659WNM3AXkGL0fa9P7sYbei7XV+/uRc0Mri4hI/dGRvohIgij068jMtjezF8xslpnNMLPzo/lbmdkzZjYn+tki7lobipkVRR3ynoym20ZjMs2JxmhqEneNDcXMtjSz8Wb2XrSP7J3wfePC6P9kupk9YmYbJ2n/MLP7zWyRmU1Pm1fl/mDBHdGYZtN1qZsaAAACx0lEQVTMLCt3MFbo191K4GJ33xnoCpxjZh2BocBz7t4BeC6aTorzgVlp0yOBW6P34ivCWE1JcTvwH3ffCfgl4X1J5L5hZq2A84CUu+9KaA1YOVZXUvaPvwI915m3vv3hUEIz9w6Ezqt3Z6UCd9cjiw/g34TB6WYD20bztgVmx11bA/39raMd9wDgSUJv7c+BxtHyvYFJcdfZQO/F5sAHRNfO0uYndd+oHK5lK0LLwSeBHknbP4ASYHp1+wNwL9C/qvXq8tCRfhZFQ0p3At4AtnH3TwCinz+Jr7IGdRswBFgdTW8NfO1hTCaoeuymQtUOWAz8JTrddZ+ZbUpC9w13/xgYDcwnDNGyBCgjuftHpfXtD5mMe1ZjCv0sMbPNCH0SLvB1xhZKCjP7DbDI3cvSZ1exalKajDUG9gDudvdOwFISciqnKtG56t5AW2A7YFPCKYx1JWX/qE69/O8o9LPAzDYiBP7f3f0f0ezPzGzbaPm2wKK46mtA+wC9zOxDwhDcBxCO/LeMxmSCqsduKlQVQIW7vxFNjyd8CCRx3wA4CPjA3Re7+w/AP4BuJHf/qLS+/SGTcc9qTKFfR9F9A/4MzHL3W9IWVY5HRPTz3w1dW0Nz92Hu3trdSwgX6J539xOAFwhjMkFC3gsAd/8UWGBmO0azDgRmksB9IzIf6GpmzaL/m8r3I5H7R5r17Q8TgJOjVjxdgSWVp4HqQp2z6sjMfgW8DLzLmvPYlxPO648D2hB29mM8QeMOmdl+wCXu/ptosL2xhAt4bwMnuvv3cdbXUMxsd+A+oAlhsMFTCQdbidw3zOwawl31VhL2hd8SzlMnYv8ws0eA/QijaX4GDAf+RRX7Q/TB+AdCa59lwKnuXlrV89aoBoW+iEhy6PSOiEiCKPRFRBJEoS8ikiAKfRGRBFHoi4gkiEJfRCRBFPoiIgmi0BcRSZD/Bxi1xSYpEZ4rAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同聚类数目的模型的性能，找到最佳模型／参数（分数最高）\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "#从图中得出10类样本为最好"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#显示聚类结果\n",
    "#画出聚类结果，每一类用一种颜色\n",
    "colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']\n",
    "\n",
    "n_clusters = 10\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters = n_clusters)\n",
    "mb_kmeans.fit(X_train_pca)\n",
    "\n",
    "y_train_pred = mb_kmeans.labels_\n",
    "cents = mb_kmeans.cluster_centers_#质心\n",
    "\n",
    "for i in range(n_clusters):\n",
    "    index = np.nonzero(y_train_pred==i)[0]\n",
    "    x1 = X_train_pca[index,0]\n",
    "    x2 = X_train_pca[index,1]\n",
    "    y_i = y_train[index]\n",
    "    for j in range(len(x1)):\n",
    "        if j < 20:  #每类打印20个\n",
    "            plt.text(x1[j],x2[j],str(int(y_i[j])),color=colors[i],\\\n",
    "                fontdict={'weight': 'bold', 'size': 9})\n",
    "    #plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],linewidths=12)\n",
    "\n",
    "plt.axis([-5,10,-6,6])\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
